Attribute Information:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(color_codes=True)
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier, VotingClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')
Approach:
The data is first skimmed to see which variables are present, their data types, the shape, the column names, any mixed data types, missing values, etc.
# Load the Parkinson's voice-measurement dataset and take a first look.
dataset = pd.read_csv('Data - Parkinsons.csv')
dataset.head()
dataset.info()
# isna() and isnull() are aliases; both calls check for any missing values.
dataset.isna().values.any()
dataset.isnull().values.any()
dataset.shape
# Class distribution of the target column 'status' (1 = Parkinson's, per the
# dataset description above) — the classes are imbalanced.
dataset.status.value_counts()
## Sample bias observed
# Treat the 0/1 target as a categorical variable rather than a numeric one.
dataset["status"] = pd.Categorical(dataset["status"])
dataset.info()
Observations from first look at the Parkinson's dataset:
dataset.describe().T
def univariate_plots(Source):
    """Draw a boxplot and a distribution plot side by side for every
    float64 column of *Source* (a pandas DataFrame).

    Each feature gets its own figure: the boxplot shows spread/outliers,
    the distribution plot shows shape/skew.
    """
    float_cols = Source.select_dtypes(include=['float64']).columns
    for col in float_cols:
        fig, axes = plt.subplots(1, 2, figsize=(10, 10))
        # Original called plt.Text('Figure for float64'), which only creates an
        # unattached Text artist and never displays anything; title the figure
        # with the feature name instead.
        fig.suptitle(col)
        sns.boxplot(Source[col], ax=axes[0])
        # NOTE(review): sns.distplot is deprecated in seaborn >= 0.11 (removed
        # in 0.14) — migrate to sns.histplot(..., kde=True) when upgrading.
        sns.distplot(Source[col], ax=axes[1])
        # Original subplots_adjust(top=1.5, right=10, left=8, bottom=1) passed
        # values far outside the valid [0, 1] figure-fraction range.
        plt.tight_layout()
univariate_plots(dataset)
Observations:
def EDA_Corr(df):
"""This gives output as Covariance matrix and feature wise uniquess i.e how much its statistically
independent. This is done with default range of corr between +0.5 to -0.6"""
corr = df.corr()
index = corr.columns
Output = []
for i in range(0,len(index)):
i = index[i]
Pos = corr.index[(corr[i] >= 0.5)].tolist()
No = corr.index[(corr[i] < 0.5) & (corr[i] > -0.6)].tolist()
Neg = corr.index[(corr[i] <= -0.5)].tolist()
leng_u = len(No)
leng_pos = len(Pos)
leng_neg = len(Neg)
Out = [i, leng_u, leng_pos, leng_neg, Pos, Neg, No]
Output.append(Out)
fig, ax = plt.subplots(figsize=(20,10))
cmap = sns.diverging_palette(230, 20, as_cmap=True)
sns.heatmap(corr,annot=True,vmin=-1,vmax=1,cmap=cmap, linewidths=0, ax = ax)
EDA_Corr(dataset)
# Pairwise scatter plots with KDE diagonals, excluding the identifier and
# target columns.
temp_df = dataset.drop(['name','status'], axis='columns')
sns.pairplot(temp_df, diag_kind='kde')
# Bar chart of the class counts (visualises the imbalance).
sns.countplot(dataset.status)
Observations:
# Feature / target split: 'name' is a record identifier, 'status' the label.
X = dataset.drop(['name','status'], axis='columns')
y = dataset.status
X.head(1)
# stratify=y keeps the observed class ratio in both splits (the class
# imbalance was noted earlier); random_state makes the split — and hence
# every downstream score — reproducible. The original split was neither
# stratified nor seeded.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42)
print(dataset.isna().values.any())
print(dataset.isnull().values.any())
# Fit the scaler on the training data only, then apply the same transform to
# the test set, so no test-set statistics leak into training.
scale = StandardScaler()
X_train = scale.fit_transform(X_train)
X_test = scale.transform(X_test)
# --- Logistic Regression --------------------------------------------------
log_model = LogisticRegression()
log_model.fit(X_train, y_train)
log_score_train = log_model.score(X_train, y_train)
print('Logistic Regression model score (Training Data):', log_score_train)
log_score = log_model.score(X_test, y_test)
print('Logistic Regression model score (Testing Data):', log_score)
log_pred = log_model.predict(X_test)
log_cm = confusion_matrix(y_test, log_pred)
# With the default label order [0, 1], cm[1][1] is the true-positive count
# for the positive (status=1) class. Typos in the printed labels
# ('Possitive'/'Possive') are fixed below.
print('True Positive = ', log_cm[1][1])
print('True Negative = ', log_cm[0][0])
print('False Positive = ', log_cm[0][1])
print('False Negative = ', log_cm[1][0])
print("Logistic Regression - Classification Report")
print(classification_report(y_test, log_pred, labels=[1, 0]))
probs = log_model.predict_proba(X_test)
preds = probs[:,1]  # predicted P(status == 1)
Logistic_Accuracy = accuracy_score(y_test,log_pred)
print('Logistic_Accuracy =', Logistic_Accuracy)
fpr, tpr, threshold = roc_curve(y_test, preds)
roc_auc = auc(fpr, tpr)
Logistic_Gini = 2*roc_auc - 1  # Gini = 2*AUC - 1
print('Logistic_Gini =', Logistic_Gini)
Logistic_AUC = roc_auc
print('Logistic_AUC =', Logistic_AUC)
sns.heatmap(confusion_matrix(y_test,log_pred), annot=True, cmap='Blues',fmt='g')
prec_log = (log_cm[1][1])/(log_cm[1][1] + log_cm[0][1])  # TP / (TP + FP)
rec_log = (log_cm[1][1])/(log_cm[1][1] + log_cm[1][0])   # TP / (TP + FN)
F1_log = 2*(prec_log*rec_log)/(prec_log + rec_log)
print('Precision for Logistic regression :', round(prec_log,2))
print('Recall for Logistic regression :', round(rec_log, 2))
print('F1 score for Logistic regression :', round(F1_log, 2))
# --- Gaussian Naive Bayes -------------------------------------------------
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)
nb_score_train = nb_model.score(X_train, y_train)
print('NB model score (Training Data):', nb_score_train)
nb_score = nb_model.score(X_test, y_test)
print('NB model score (Testing Data):', nb_score)
nb_pred = nb_model.predict(X_test)
nb_cm = confusion_matrix(y_test, nb_pred)
# cm[1][1] = TP for the positive (status=1) class; label typos fixed.
print('True Positive = ', nb_cm[1][1])
print('True Negative = ', nb_cm[0][0])
print('False Positive = ', nb_cm[0][1])
print('False Negative = ', nb_cm[1][0])
print("NB - Classification Report")
print(classification_report(y_test, nb_pred, labels=[1, 0]))
probs = nb_model.predict_proba(X_test)
preds = probs[:,1]  # predicted P(status == 1)
NB_Accuracy = accuracy_score(y_test,nb_pred)
print('NB_Accuracy =', NB_Accuracy)
fpr, tpr, threshold = roc_curve(y_test, preds)
roc_auc = auc(fpr, tpr)
NB_Gini = 2*roc_auc - 1  # Gini = 2*AUC - 1
print('NB_Gini =', NB_Gini)
NB_AUC = roc_auc
print('NB_AUC =', NB_AUC)
sns.heatmap(confusion_matrix(y_test,nb_pred), annot=True, cmap='Reds',fmt='g')
prec_nb = (nb_cm[1][1])/(nb_cm[1][1] + nb_cm[0][1])  # TP / (TP + FP)
rec_nb = (nb_cm[1][1])/(nb_cm[1][1] + nb_cm[1][0])   # TP / (TP + FN)
F1_nb = 2*(prec_nb*rec_nb)/(prec_nb + rec_nb)
print('Precision for Naive Bayes :', round(prec_nb,2))
print('Recall for Naive Bayes :', round(rec_nb, 2))
print('F1 score for Naive Bayes :', round(F1_nb, 2))
# --- SVM, rbf kernel ------------------------------------------------------
# probability=True enables predict_proba (needed for the ROC curve and for
# soft voting later); note 'degree' is ignored by the rbf kernel.
svm_model = SVC(C = 30, degree = 3, gamma = "auto", kernel = "rbf", probability = True)
svm_model.fit(X_train, y_train)
svm_score_rbf_train = svm_model.score(X_train, y_train)
print('SVM model score (Training Data):', svm_score_rbf_train)
svm_score_rbf = svm_model.score(X_test, y_test)
print('SVM model score (Testing Data):', svm_score_rbf)
svm_pred = svm_model.predict(X_test)
svm_cm = confusion_matrix(y_test, svm_pred)
# cm[1][1] = TP for the positive (status=1) class; label typos fixed.
print('True Positive = ', svm_cm[1][1])
print('True Negative = ', svm_cm[0][0])
print('False Positive = ', svm_cm[0][1])
print('False Negative = ', svm_cm[1][0])
print("SVC - Classification Report")
print(classification_report(y_test, svm_pred, labels=[1, 0]))
probs = svm_model.predict_proba(X_test)
preds = probs[:,1]  # predicted P(status == 1)
SVM_Accuracy = accuracy_score(y_test, svm_pred)
print('SVM_Accuracy =', SVM_Accuracy)
fpr, tpr, threshold = roc_curve(y_test, preds)
roc_auc = auc(fpr, tpr)
SVM_Gini = 2*roc_auc - 1  # Gini = 2*AUC - 1
print('SVM_Gini =', SVM_Gini)
SVM_AUC = roc_auc
print('SVM_AUC =', SVM_AUC)
sns.heatmap(confusion_matrix(y_test,svm_pred), annot=True, cmap='Greens',fmt='g')
prec_svm_r = (svm_cm[1][1])/(svm_cm[1][1] + svm_cm[0][1])  # TP / (TP + FP)
rec_svm_r = (svm_cm[1][1])/(svm_cm[1][1] + svm_cm[1][0])   # TP / (TP + FN)
F1_svm_r = 2*(prec_svm_r * rec_svm_r)/(prec_svm_r + rec_svm_r)
print('Precision for SVM(rbf) :', round(prec_svm_r,2))
print('Recall for SVM(rbf) :', round(rec_svm_r, 2))
print('F1 score for SVM(rbf) :', round(F1_svm_r, 2))
# --- SVM, linear kernel ---------------------------------------------------
# Note: 'degree' and 'gamma' are ignored by the linear kernel.
svm_model_l = SVC(C = 30, degree = 3, gamma = "auto", kernel = "linear", probability = True)
svm_model_l.fit(X_train, y_train)
svm_score_linear_train = svm_model_l.score(X_train, y_train)
print('SVM model score (Training Data):', svm_score_linear_train)
svm_linear_score = svm_model_l.score(X_test, y_test)
print('SVM model score (Testing Data):', svm_linear_score)
# svm_pred / svm_cm are deliberately reused (overwriting the rbf section's
# values) — the rbf metrics were already captured in their own variables.
svm_pred = svm_model_l.predict(X_test)
svm_cm = confusion_matrix(y_test, svm_pred)
# cm[1][1] = TP for the positive (status=1) class; label typos fixed.
print('True Positive = ', svm_cm[1][1])
print('True Negative = ', svm_cm[0][0])
print('False Positive = ', svm_cm[0][1])
print('False Negative = ', svm_cm[1][0])
print("SVC - Classification Report")
print(classification_report(y_test, svm_pred, labels=[1, 0]))
probs = svm_model_l.predict_proba(X_test)
preds = probs[:,1]  # predicted P(status == 1)
SVM_Accuracy_Linear = accuracy_score(y_test, svm_pred)
print('SVM_Accuracy_Linear =', SVM_Accuracy_Linear)
fpr, tpr, threshold = roc_curve(y_test, preds)
roc_auc = auc(fpr, tpr)
SVM_Gini_Linear = 2*roc_auc - 1  # Gini = 2*AUC - 1
print('SVM_Gini_Linear =', SVM_Gini_Linear)
SVM_AUC_Linear = roc_auc
print('SVM_AUC_Linear =', SVM_AUC_Linear)
sns.heatmap(confusion_matrix(y_test,svm_pred), annot=True, cmap='rocket_r',fmt='g')
prec_svm_l = (svm_cm[1][1])/(svm_cm[1][1] + svm_cm[0][1])  # TP / (TP + FP)
rec_svm_l = (svm_cm[1][1])/(svm_cm[1][1] + svm_cm[1][0])   # TP / (TP + FN)
F1_svm_l = 2*(prec_svm_l * rec_svm_l)/(prec_svm_l + rec_svm_l)
print('Precision for SVM(linear) :', round(prec_svm_l,2))
print('Recall for SVM(linear) :', round(rec_svm_l, 2))
print('F1 score for SVM(linear) :', round(F1_svm_l, 2))
We combine the traditional supervised learning algorithms — Logistic Regression, Naive Bayes, and SVM classifiers with rbf and linear kernels — using sklearn's ensemble VotingClassifier with soft voting. (Strictly speaking this is a voting ensemble rather than true stacking, since no meta-learner is trained on the base models' outputs.)
# --- Soft-voting ensemble over the four fitted base models ----------------
# voting='soft' averages predicted probabilities, which is why every base
# SVC was constructed with probability=True.
sclf = VotingClassifier(estimators=[('log', log_model), ('NB', nb_model),
                                    ('SVMR', svm_model), ('SVML', svm_model_l)],
                        voting='soft')
sclf.fit(X_train, y_train)
sclf_score_train = sclf.score(X_train, y_train)
print('Meta Classifier score (Training Data):', sclf_score_train)
sclf_score = sclf.score(X_test, y_test)
print('Meta Classifier score (Testing Data):', sclf_score)
sclf_pred = sclf.predict(X_test)
sclf_cm = confusion_matrix(y_test, sclf_pred)
# cm[1][1] = TP for the positive (status=1) class; label typos fixed.
print('True Positive = ', sclf_cm[1][1])
print('True Negative = ', sclf_cm[0][0])
print('False Positive = ', sclf_cm[0][1])
print('False Negative = ', sclf_cm[1][0])
print("Meta Classifier - Classification Report")
print(classification_report(y_test, sclf_pred, labels=[1, 0]))
probs = sclf.predict_proba(X_test)
preds = probs[:,1]  # predicted P(status == 1)
sclf_Accuracy = accuracy_score(y_test, sclf_pred)
print('sclf_Accuracy =', sclf_Accuracy)
fpr, tpr, threshold = roc_curve(y_test, preds)
roc_auc = auc(fpr, tpr)
sclf_Gini = 2*roc_auc - 1  # Gini = 2*AUC - 1
print('sclf_Gini =', sclf_Gini)
sclf_AUC = roc_auc
print('sclf_AUC =', sclf_AUC)
sns.heatmap(confusion_matrix(y_test,sclf_pred), annot=True, cmap='BuGn_r',fmt='g')
prec_meta = (sclf_cm[1][1])/(sclf_cm[1][1] + sclf_cm[0][1])  # TP / (TP + FP)
rec_meta = (sclf_cm[1][1])/(sclf_cm[1][1] + sclf_cm[1][0])   # TP / (TP + FN)
F1_meta = 2*(prec_meta * rec_meta)/(prec_meta + rec_meta)
print('Precision for Meta classifier :', round(prec_meta,2))
print('Recall for Meta classifier :', round(rec_meta, 2))
print('F1 score for Meta classifier :', round(F1_meta, 2))
# --- Decision Tree (depth/leaf limits to curb overfitting) ----------------
dt_model = DecisionTreeClassifier(max_depth=3, max_leaf_nodes=5, criterion = 'entropy')
dt_model.fit(X_train, y_train)
dt_score_train = dt_model.score(X_train, y_train)
print('DT model score (Training Data):', dt_score_train)
dt_score = dt_model.score(X_test, y_test)
print('DT model score (Testing Data):', dt_score)
dt_pred = dt_model.predict(X_test)
dt_cm = confusion_matrix(y_test, dt_pred)
# cm[1][1] = TP for the positive (status=1) class; label typos fixed.
print('True Positive = ', dt_cm[1][1])
print('True Negative = ', dt_cm[0][0])
print('False Positive = ', dt_cm[0][1])
print('False Negative = ', dt_cm[1][0])
print("Decision Tree - Classification Report")
print(classification_report(y_test, dt_pred, labels=[1, 0]))
probs = dt_model.predict_proba(X_test)
preds = probs[:,1]  # predicted P(status == 1)
DT_Accuracy = accuracy_score(y_test, dt_pred)
print('DT_Accuracy =', DT_Accuracy)
fpr, tpr, threshold = roc_curve(y_test, preds)
roc_auc = auc(fpr, tpr)
DT_Gini = 2*roc_auc - 1  # Gini = 2*AUC - 1
print('DT_Gini =', DT_Gini)
DT_AUC = roc_auc
print('DT_AUC =', DT_AUC)
sns.heatmap(confusion_matrix(y_test,dt_pred), annot=True, cmap='Purples',fmt='g')
prec_dt = (dt_cm[1][1])/(dt_cm[1][1] + dt_cm[0][1])  # TP / (TP + FP)
rec_dt = (dt_cm[1][1])/(dt_cm[1][1] + dt_cm[1][0])   # TP / (TP + FN)
F1_dt = 2*(prec_dt * rec_dt)/(prec_dt + rec_dt)
print('Precision for DT classifier :', round(prec_dt,2))
print('Recall for DT classifier :', round(rec_dt, 2))
print('F1 score for DT classifier :', round(F1_dt, 2))
# --- Random Forest --------------------------------------------------------
rf_model = RandomForestClassifier(n_estimators=10, criterion='entropy', max_depth=3, max_leaf_nodes=5)
rf_model.fit(X_train, y_train)
rf_score_train = rf_model.score(X_train, y_train)
print('RF model score (Training Data):', rf_score_train)
rf_score = rf_model.score(X_test, y_test)
print('RF model score (Testing Data):', rf_score)
rf_pred = rf_model.predict(X_test)
rf_cm = confusion_matrix(y_test, rf_pred)
# cm[1][1] = TP for the positive (status=1) class; label typos fixed.
print('True Positive = ', rf_cm[1][1])
print('True Negative = ', rf_cm[0][0])
print('False Positive = ', rf_cm[0][1])
print('False Negative = ', rf_cm[1][0])
print("Random Forest - Classification Report")
print(classification_report(y_test, rf_pred, labels=[1, 0]))
probs = rf_model.predict_proba(X_test)
preds = probs[:,1]  # predicted P(status == 1)
RF_Accuracy = accuracy_score(y_test, rf_pred)
print('RF_Accuracy =', RF_Accuracy)
fpr, tpr, threshold = roc_curve(y_test, preds)
roc_auc = auc(fpr, tpr)
RF_Gini = 2*roc_auc - 1  # Gini = 2*AUC - 1
print('RF_Gini =', RF_Gini)
RF_AUC = roc_auc
print('RF_AUC =', RF_AUC)
sns.heatmap(confusion_matrix(y_test,rf_pred), annot=True, cmap='Oranges',fmt='g')
prec_rf = (rf_cm[1][1])/(rf_cm[1][1] + rf_cm[0][1])  # TP / (TP + FP)
rec_rf = (rf_cm[1][1])/(rf_cm[1][1] + rf_cm[1][0])   # TP / (TP + FN)
F1_rf = 2*(prec_rf * rec_rf)/(prec_rf + rec_rf)
print('Precision for RF classifier :', round(prec_rf,2))
print('Recall for RF classifier :', round(rec_rf, 2))
print('F1 score for RF classifier :', round(F1_rf, 2))
# --- Bagging over Random-Forest base estimators ---------------------------
helper = RandomForestClassifier(n_estimators=100, criterion='entropy', max_depth=3, max_leaf_nodes=5)
# NOTE(review): 'base_estimator' was renamed 'estimator' in scikit-learn 1.2
# and removed in 1.4 — update the keyword when upgrading sklearn.
bag_model = BaggingClassifier(base_estimator=helper, n_estimators=100)
bag_model.fit(X_train, y_train)
bag_score_train = bag_model.score(X_train, y_train)
print('Bagging model score (Training Data):', bag_score_train)
bag_score = bag_model.score(X_test, y_test)
print('Bagging model score (Testing Data):', bag_score)
bag_pred = bag_model.predict(X_test)
bag_cm = confusion_matrix(y_test, bag_pred)
# cm[1][1] = TP for the positive (status=1) class; label typos fixed.
print('True Positive = ', bag_cm[1][1])
print('True Negative = ', bag_cm[0][0])
print('False Positive = ', bag_cm[0][1])
print('False Negative = ', bag_cm[1][0])
print("Bagging - Classification Report")
print(classification_report(y_test, bag_pred, labels=[1, 0]))
probs = bag_model.predict_proba(X_test)
preds = probs[:,1]  # predicted P(status == 1)
BAG_Accuracy = accuracy_score(y_test, bag_pred)
print('BAG_Accuracy =', BAG_Accuracy)
fpr, tpr, threshold = roc_curve(y_test, preds)
roc_auc = auc(fpr, tpr)
BAG_Gini = 2*roc_auc - 1  # Gini = 2*AUC - 1
print('BAG_Gini =', BAG_Gini)
BAG_AUC = roc_auc
print('BAG_AUC =', BAG_AUC)
sns.heatmap(confusion_matrix(y_test,bag_pred), annot=True, cmap='Pastel2',fmt='g')
prec_bag = (bag_cm[1][1])/(bag_cm[1][1] + bag_cm[0][1])  # TP / (TP + FP)
rec_bag = (bag_cm[1][1])/(bag_cm[1][1] + bag_cm[1][0])   # TP / (TP + FN)
F1_bag = 2*(prec_bag * rec_bag)/(prec_bag + rec_bag)
print('Precision for Bagging classifier :', round(prec_bag,2))
print('Recall for Bagging classifier :', round(rec_bag, 2))
print('F1 score for Bagging classifier :', round(F1_bag, 2))
Adaptive Boosting
# --- AdaBoost over a Random-Forest base estimator -------------------------
helper = RandomForestClassifier(n_estimators=150, max_depth=5, max_leaf_nodes=3)
#helper = DecisionTreeClassifier(criterion='entropy')
#helper = LogisticRegression()
# NOTE(review): 'base_estimator' was renamed 'estimator' in scikit-learn 1.2
# and removed in 1.4 — update the keyword when upgrading sklearn.
Aboost_model = AdaBoostClassifier(n_estimators= 500, base_estimator=helper, learning_rate=1, random_state=1)
Aboost_model.fit(X_train, y_train)
Aboost_score_train = Aboost_model.score(X_train, y_train)
print('Ada Boost model score (Training Data):', Aboost_score_train)
Aboost_score = Aboost_model.score(X_test, y_test)
print('Ada Boost model score (Testing Data):', Aboost_score)
Aboost_pred = Aboost_model.predict(X_test)
Aboost_cm = confusion_matrix(y_test, Aboost_pred)
# cm[1][1] = TP for the positive (status=1) class; label typos fixed.
print('True Positive = ', Aboost_cm[1][1])
print('True Negative = ', Aboost_cm[0][0])
print('False Positive = ', Aboost_cm[0][1])
print('False Negative = ', Aboost_cm[1][0])
print("Ada Boost - Classification Report")
print(classification_report(y_test, Aboost_pred, labels=[1, 0]))
probs = Aboost_model.predict_proba(X_test)
preds = probs[:,1]  # predicted P(status == 1)
ABOOST_Accuracy = accuracy_score(y_test, Aboost_pred)
print('Ada BOOST_Accuracy =', ABOOST_Accuracy)
fpr, tpr, threshold = roc_curve(y_test, preds)
roc_auc = auc(fpr, tpr)
ABOOST_Gini = 2*roc_auc - 1  # Gini = 2*AUC - 1
print('Ada BOOST_Gini =', ABOOST_Gini)
ABOOST_AUC = roc_auc
print('Ada BOOST_AUC =', ABOOST_AUC)
sns.heatmap(confusion_matrix(y_test,Aboost_pred), annot=True, cmap='coolwarm',fmt='g')
prec_aboost = (Aboost_cm[1][1])/(Aboost_cm[1][1] + Aboost_cm[0][1])  # TP / (TP + FP)
rec_aboost = (Aboost_cm[1][1])/(Aboost_cm[1][1] + Aboost_cm[1][0])   # TP / (TP + FN)
F1_aboost = 2*(prec_aboost * rec_aboost)/(prec_aboost + rec_aboost)
print('Precision for AdaBoost classifier :', round(prec_aboost,2))
print('Recall for AdaBoost classifier :', round(rec_aboost, 2))
print('F1 score for AdaBoost classifier :', round(F1_aboost, 2))
Gradient Boosting
# --- Gradient Boosting ----------------------------------------------------
Gboost_model = GradientBoostingClassifier(n_estimators= 250, learning_rate=1, max_depth=5, max_leaf_nodes=3)
Gboost_model.fit(X_train, y_train)
Gboost_score_train = Gboost_model.score(X_train, y_train)
print('GRAD Boost model score (Training Data):', Gboost_score_train)
Gboost_score = Gboost_model.score(X_test, y_test)
print('GRAD Boost model score (Testing Data):', Gboost_score)
Gboost_pred = Gboost_model.predict(X_test)
Gboost_cm = confusion_matrix(y_test, Gboost_pred)
# cm[1][1] = TP for the positive (status=1) class; label typos fixed.
print('True Positive = ', Gboost_cm[1][1])
print('True Negative = ', Gboost_cm[0][0])
print('False Positive = ', Gboost_cm[0][1])
print('False Negative = ', Gboost_cm[1][0])
print("GRAD Boost - Classification Report")
print(classification_report(y_test, Gboost_pred, labels=[1, 0]))
probs = Gboost_model.predict_proba(X_test)
preds = probs[:,1]  # predicted P(status == 1)
GBOOST_Accuracy = accuracy_score(y_test, Gboost_pred)
print('GRAD BOOST_Accuracy =', GBOOST_Accuracy)
fpr, tpr, threshold = roc_curve(y_test, preds)
roc_auc = auc(fpr, tpr)
GBOOST_Gini = 2*roc_auc - 1  # Gini = 2*AUC - 1
print('GRAD BOOST_Gini =', GBOOST_Gini)
GBOOST_AUC = roc_auc
print('GRAD BOOST_AUC =', GBOOST_AUC)
sns.heatmap(confusion_matrix(y_test,Gboost_pred), annot=True, cmap='coolwarm',fmt='g')
prec_gboost = (Gboost_cm[1][1])/(Gboost_cm[1][1] + Gboost_cm[0][1])  # TP / (TP + FP)
rec_gboost = (Gboost_cm[1][1])/(Gboost_cm[1][1] + Gboost_cm[1][0])   # TP / (TP + FN)
F1_gboost = 2*(prec_gboost * rec_gboost)/(prec_gboost + rec_gboost)
print('Precision for G-Boost classifier :', round(prec_gboost,2))
print('Recall for G-Boost classifier :', round(rec_gboost, 2))
print('F1 score for G-Boost classifier :', round(F1_gboost, 2))
# Collect every model's metrics into one table; each inner list holds
# [Accuracy, Gini, AUC, Precision, Recall, F1] and the row order matches the
# index labels passed to pd.DataFrame below.
Compare = [[Logistic_Accuracy,Logistic_Gini,Logistic_AUC, prec_log, rec_log, F1_log],
[NB_Accuracy,NB_Gini,NB_AUC, prec_nb, rec_nb, F1_nb],
[SVM_Accuracy_Linear,SVM_Gini_Linear,SVM_AUC_Linear, prec_svm_l, rec_svm_l, F1_svm_l],
[SVM_Accuracy,SVM_Gini,SVM_AUC,prec_svm_r, rec_svm_r, F1_svm_r],
[DT_Accuracy,DT_Gini,DT_AUC, prec_dt, rec_dt, F1_dt],
[RF_Accuracy,RF_Gini,RF_AUC,prec_rf, rec_rf, F1_rf],
[BAG_Accuracy,BAG_Gini,BAG_AUC, prec_bag, rec_bag, F1_bag],
[ABOOST_Accuracy,ABOOST_Gini,ABOOST_AUC, prec_aboost, rec_aboost, F1_aboost],
[GBOOST_Accuracy,GBOOST_Gini,GBOOST_AUC,prec_gboost, rec_gboost, F1_gboost],
[sclf_Accuracy,sclf_Gini,sclf_AUC, prec_meta, rec_meta, F1_meta]
]
# Render the comparison as a DataFrame (displayed as the cell's output).
pd.DataFrame(Compare,
columns= ['Accuracy','Gini Index', 'AUC', 'Precision', 'Recall', 'F1 score'],
index=['Logistic Regression','NB','SVM (Linear)','SVM (rbf)','Decision Tree',
'Random Forest', 'Bagging','ADA-Boosting', 'GRAD-Boosting', 'Meta-sclf'])
# Train vs test accuracy per model; the score lists are ordered to match
# 'legend' (LR, NB, SVM linear, SVM rbf, Meta, DT, RF, BAG, ABOO, GBOO).
legend = ['LR', 'NB', 'SVM(L)', 'SVM(R)', 'Meta', 'DT', 'RF', 'BAG', 'ABOO', 'GBOO']
scores_test = [log_score, nb_score, svm_linear_score,
svm_score_rbf, sclf_score, dt_score, rf_score, bag_score, Aboost_score, Gboost_score]
scores_train = [log_score_train, nb_score_train, svm_score_linear_train, svm_score_rbf_train, sclf_score_train,
dt_score_train, rf_score_train, bag_score_train, Aboost_score_train, Gboost_score_train]
plt.figure(figsize=(16, 3))
plt.title('Accuracy comparison')
# NOTE(review): positional x/y args to sns.pointplot were removed in
# seaborn 0.14 — pass x=..., y=... keywords when upgrading seaborn.
sns.pointplot(legend, scores_test, color = 'red', label = 'Test Accuracy')
sns.pointplot(legend, scores_train, color ='blue', label = 'Training Accuracy')
Accuracy scores of many models are similar, but SVM (rbf) and the Ada-Boost classifier have the best test accuracy (red). However, for a dataset with sample bias / class imbalance, accuracy is not the best evaluation metric; we also need to look at Precision, Recall and F1 scores along with the AUC and Gini scores.
# Precision / Recall / F1 per model, each list ordered to match 'legend'.
prec_test = [prec_log, prec_nb, prec_svm_l, prec_svm_r, prec_meta, prec_dt, prec_rf, prec_bag, prec_aboost, prec_gboost]
plt.figure(figsize=(16, 2))
plt.title('Precision comparison')
sns.pointplot(legend, prec_test, color ='orange')
rec_test = [rec_log, rec_nb, rec_svm_l, rec_svm_r, rec_meta, rec_dt, rec_rf, rec_bag, rec_aboost, rec_gboost]
plt.figure(figsize=(16, 2))
plt.title('Recall comparison')
sns.pointplot(legend, rec_test, color ='violet')
F1_test = [F1_log, F1_nb, F1_svm_l, F1_svm_r, F1_meta, F1_dt, F1_rf, F1_bag, F1_aboost, F1_gboost]
plt.figure(figsize=(16, 2))
plt.title('F1 scores comparison')
sns.pointplot(legend, F1_test, color ='brown')
The overall F1 score (i.e the harmonic mean of Precision and Recall) is the best for Ada boost model with SVM (rbf) coming a close second. This inclines us more in favour of Adaptive boosting
# Gini score per model. The original listed SVM_Gini (rbf) under the
# 'svm_linear_gini' label and SVM_Gini_Linear under 'svm_rbf_gini';
# the value order is fixed here to match gini_leg.
gini_test = [Logistic_Gini, NB_Gini, SVM_Gini_Linear,
             SVM_Gini, sclf_Gini, DT_Gini, RF_Gini, BAG_Gini, ABOOST_Gini, GBOOST_Gini]
gini_leg=['log_gini', 'nb_gini', 'svm_linear_gini', 'svm_rbf_gini', 'meta_class_gini', 'DT_gini',
'RF_gini', 'BAG_gini', 'A-Boost_gini', 'G-Boost_gini']
plt.figure(figsize=(16, 2))
plt.title('Gini comparison')
sns.pointplot(gini_leg, gini_test, color ='purple')
Gini scores for SVM(rbf) and AdaBoost classifiers are similar and best among others. However, Ada-boost scores just edges out SVM
# AUC per model. The original listed SVM_AUC (rbf) under the
# 'svm_linear_AUC' label and SVM_AUC_Linear under 'svm_rbf_AUC';
# the value order is fixed here to match auc_leg.
auc_test = [Logistic_AUC, NB_AUC, SVM_AUC_Linear,
            SVM_AUC, sclf_AUC, DT_AUC, RF_AUC, BAG_AUC, ABOOST_AUC, GBOOST_AUC]
auc_leg=['log_AUC', 'nb_AUC', 'svm_linear_AUC', 'svm_rbf_AUC', 'meta_class_AUC', 'DT_AUC',
'RF_AUC', 'BAG_AUC', 'A-Boost_AUC', 'G-BOOST_AUC']
plt.figure(figsize=(16, 2))
plt.title('AUC comparison')
sns.pointplot(auc_leg, auc_test, color ='green')
ADABoost model here clearly has the best AUC score among all the models
We would choose Adaptive Boosting method because of the following reasons -
As seen from the above plots, SVM (rbf kernel) gives the same accuracy as Ada Boosting, but the latter has slightly better Gini and AUC scores. Random Forest, Gradient Boost and Bagging classifiers have similar accuracy scores; Decision Tree, Logistic Regression and SVM (linear kernel) likewise have similar accuracy scores.